import pandas as pd # Load dataset df = pd.read_csv(r"data/raw/regional_sales_extended.csv") # ---- BASIC INSPECTION ---- print("Columns:", df.columns.tolist()) print(f"Total records: {len(df)}") print(f"Date range: {df['Date'].min()} to {df['Date'].max()}") print(f"Regions: {df['Region'].unique()}") # ---- DATA CLEANING ---- # Convert Date column to datetime df["Date"] = pd.to_datetime(df["Date"], errors="coerce") # Remove rows with missing critical values df = df.dropna(subset=["Date", "Sales", "Region"]) # Ensure numeric sales df["Sales"] = pd.to_numeric(df["Sales"], errors="coerce") # Remove invalid sales (zero or negative) df = df[df["Sales"] > 0] print(f"\nRecords after cleaning: {len(df)}") # ---- AGGREGATE BY MONTH AND REGION ---- # This preserves regional breakdown for individual forecasting monthly_regional_sales = ( df .groupby([pd.Grouper(key="Date", freq="M"), "Region"]) .agg({"Sales": "sum"}) .reset_index() ) # Rename Date column to Month for clarity monthly_regional_sales.rename(columns={"Date": "Month"}, inplace=True) print("\nMonthly regional sales sample:") print(monthly_regional_sales.head(10)) print(f"\nTotal months per region: {monthly_regional_sales.groupby('Region').size()}") # ---- SAVE FOR FORECASTING ---- monthly_regional_sales.to_csv( r"data/Processed/monthly_regional_sales.csv", index=False ) print("\nProcessed file saved: data/Processed/monthly_regional_sales.csv")